It has been well documented that live birth follows a seasonal pattern with more babies born in Spring/Summer in most countries of the northen Hemisphere.
We downloaded monthly live birth data from the UN [http://data.un.org/Data.aspx?d=POP&f=tableCode%3A55]
birth = read.csv(paste0(IO$birth_records_dir,"UNdata_Export_20180713_023958101_all.csv"), header = TRUE)
# removing the rows with the total number of birth - only keeping the monthly data
months = format(ISOdate(2004,1:12,1),"%B")
j = birth$Month %in% months
birth = birth[which(j),]
rm(j)
# proper date
birth$date = as.Date(paste0(birth$Year, "-",birth$Month,"-01"), format = c("%Y-%B-%d"))
range(birth$date)## [1] "1967-01-01" "2017-12-01"
# matching country names with countriesLow Names
birth$Country.or.Area = gsub("United States of America","United States", birth$Country.or.Area)
# getting geo-location
mat = match(birth$Country.or.Area, countriesLow$NAME)
birth$lat = countriesLow$LAT[mat]
birth$lon = countriesLow$LON[mat]
rm(mat)
# sorting countries by latitude
countries.levels = unique(birth$Country.or.Area)
m = match(countries.levels,countriesLow$NAME )
lat = countriesLow$LAT[m]
o = order(lat, decreasing = TRUE)
countries.levels = countries.levels[o]
birth$Country.or.Area = factor(birth$Country.or.Area, levels = countries.levels)
save(birth, file = paste0(IO$out_Rdata,"UN_birth_data.Rdata"))
rm(countries.levels, m, lat, o)# "Australia", "New Zealand","Bermuda", "Greenland", "Guadeloupe",
countries = c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan","Guatemala","American Samoa","New Caledonia","Seychelles","Chile")
# order countries by decreasing latitude
mat = match(countries, countriesLow$NAME)
lat = countriesLow$LAT[mat]
o = order(lat, decreasing = TRUE)
countries = countries[o]
rm(o, lat, mat)
# select countries. Selected countries had long term data and included those from Northern and Southern hemisphere
c = which(birth$Country.or.Area %in% countries)
g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + ggtitle("Absolute number of live birth for selected countries")g = ggplot(birth[c,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries /!\ y-scale does not include 0")countries2 = countries[-which(countries %in% c("Guatemala","Switzerland","Finland"))]
cc = which((birth$Country.or.Area %in% countries2) & (birth$Year >= 2000))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 0,hjust = 0),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())+ #expand_limits(y=25)+
guides(col = FALSE)+ ylab("Number of live births (min to max values)")+
ggtitle("Number of live birth for selected countries in the last 2 decades")long_ago = 1978:1981
cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% long_ago))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in the late 70's")cc = which((birth$Country.or.Area %in% countries) & (birth$Year %in% long_ago))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in the late 70's")recently = 2011:2013
cc = which((birth$Country.or.Area %in% c("Finland","France","United States", "Israel")) & (birth$Year %in% recently))
g = ggplot(birth[cc,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in recent years")sh = which(birth$lat < 0)
g = ggplot(birth[sh,], aes(x = date, y = Value, col = Country.or.Area))
g + geom_line() + facet_grid( Country.or.Area ~ . , scales = "free_y") +
theme(strip.text.y = element_text(angle = 90)) +
ggtitle("Number of live birth for selected countries in the SOUTH hemisphere")rm(sh)Martinez-Bakker et al., 2014 have shown that the peak time of birth follows a pattern that depends on the latitude of the considered country.
To better understand this relationship and to later investigate the potential origin of birth seasonality, we do a rhythm analysis of the birth patterns to determine the phase (= the peak time) and the relative amplitude of the birth signals, taken year by year.
BR = data.frame()
for(country in unique(birth$Country.or.Area)){
c = which(birth$Country.or.Area == country)
b = birth[c,]
years = table(b$Year)
years = names(years)[years == 12]
for(y in years){
j = which(b$Year == y)
pft = periodic.fisher.test(t = 1:12, x = b$Value[j], p = 12)
line = data.frame(country = country, year = as.numeric(y), pval = pft$pval , phase = pft$phase, rel.ampl = pft$rel.amp)
BR = rbind(BR, line)
}
}
rm(line,c, b, j,y, country, pft, years)
hist(BR$pval, breaks = 50)# we expect p-value to be small if there is rhythmicity
BR$qval = p.adjust(BR$pval, method = "BH")# and plot the phase (in month), show the p-value and order by latitude.
m = match(BR$country, countriesLow$NAME)
BR$lat = countriesLow$LAT[m]
BR$lon = countriesLow$LON[m]
BR$significant_rhythm = BR$qval<=0.1
BR$month = BR$phase
BR$latitude = BR$lat
rm(m)
g = ggplot(BR, aes(x = phase, y= lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
ggtitle("Peak month of birth by latitude and years")## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).
g = ggplot(BR, aes(x = phase, y= lat, col = year, alpha = qval))
g + geom_point() + xlim(c(0,12)) + scale_x_continuous(breaks = 0:12) +
scale_alpha(range = c(0.1,1))## Scale for 'x' is already present. Adding another scale for 'x', which
## will replace the existing scale.
## Warning: Removed 658 rows containing missing values (geom_point).
ggtitle("Peak month of birth by latitude and years")## $title
## [1] "Peak month of birth by latitude and years"
##
## attr(,"class")
## [1] "labels"
g = ggplot(BR, aes(x = month, y= latitude, col = year, alpha = significant_rhythm))
g + geom_point(size = 0.75) + #xlim(c(0,12)) +
geom_hline(yintercept = 0)+
scale_x_continuous(breaks = 0:12) +
scale_alpha_discrete(range = c(0.2,1))+
scale_color_gradient(low = "red4", high = "coral1")+
ggtitle("Peak month of birth by latitude and years")## Warning: Using alpha for a discrete variable is not advised.
## Warning: Removed 658 rows containing missing values (geom_point).
#mapw = map_data("world")
#g = ggplot()
#g + geom_polygon(data = mapw, aes(x=long, y = lat, group = group), fill= NA, col = "gray") +
# geom_point(data =BR, aes(x = lon + year-2000, y = lat, col = phase, alpha = (qval<=0.1)/2+0.3))
# non-transparent means significant. Phase is the month number
#rm(mapw)We indeed observe this latitude-dependency for the peak time of birth. Interestingly, for some countries in the Nothern hemisphere, we also see a shift towards later later peak time in recent years.
Having a closer look to some countries with measurements for many years:
c = which(BR$country %in% countries)
g = ggplot(BR[c,], aes(x = phase, y = year))
g + geom_point(aes(col = (qval <= 0.1))) +
facet_grid( country ~ .) +
theme_minimal() +
theme(strip.text.y = element_text(angle = 0))+
ggtitle("Shift in birth month since the 70's for many countries")# note that for some countries, the trend for the phase does change over the years
subset_countries = c("Sweden","Denmark","Finland","Canada","Germany", "France", "Switzerland", "United States","Israel","Japan")
g = ggplot(BR[BR$country %in% subset_countries, ], aes(x = phase, y = year, col = country))
g + geom_point(aes(alpha = (qval <= 0.1)/2+0.5)) +
geom_path(data = BR[(BR$country %in% subset_countries) & (BR$qval <= 0.1),], aes(x = phase, y = year, col = country)) +
xlim(c(0,12))+
theme_minimal() +
theme(strip.text.y = element_text(angle = 0))+
ggtitle("Except for the US and Israel, most countries have seen their peak month of birth being delayed")??? Maybe should first de-trend the data before computing the phase? But it does not seem to correlate with the overall trends…
We were then wondering if the amplitude of the oscillations were also dependent on latitude.
g = ggplot(BR, aes(x = rel.ampl , y = lat, col = year, alpha = (qval<=0.1)/2+0.3))
g + geom_point() + ggtitle("Relative amplitude of birth seasonality by latitude")## Warning: Removed 658 rows containing missing values (geom_point).
It does not seem to be the case.
In EU and US (which is the main population of the two apps) in recent years, the phases are quite similar (despite the relatively large variation in latitude), we can thus use Sympto and Kindara’s dataset to explore the origin of seasonality: i.e. is it a change in sexual behavior or a change in fertility?
countries.s = c("Germany", "Switzerland", "France","United States")
g = ggplot(BR[BR$country %in% countries.s,], aes(x = phase, y = year, col = country))
g + geom_point() + xlim(c(0,12)) + geom_hline(yintercept = 2010) +
ggtitle("Countries for which we have users in Sympto's dataset have similar peak month in the last decade")The peak birth month for these countries in recent years are July-August. The estimated time of conception is thus ~ November.
For Sympto, we have both location and user’s birth year (age). For Kindara, we currently do not have access to this information. Unfortunately, Sympto’s dataset is a lot smaller than Kindara’s dataset.
Clue, Kindara, Sympto
We downloaded monthly live birth data from the UN [http://data.un.org/Data.aspx?d=POP&f=tableCode%3A55]
UN_birth = read.csv(paste0(IO$birth_records_dir,"UNdata_Export_20180713_023958101_all.csv"), header = TRUE)
# removing the rows with the total number of birth - only keeping the monthly data
months = format(ISOdate(2004,1:12,1),"%B")
UN_birth = UN_birth %>% dplyr::filter(Month %in% months)
# proper date
UN_birth$date = as.Date(str_c(UN_birth$Year, "-",UN_birth$Month,"-01"), format = c("%Y-%B-%d"))
range(UN_birth$date)## [1] "1967-01-01" "2017-12-01"
# matching country names with countriesLow Names
UN_birth$Country.or.Area = gsub("United States of America","United States", UN_birth$Country.or.Area)
UN_birth$Country.or.Area = gsub("United Kingdom of Great Britain and Northern Ireland","United Kingdom", UN_birth$Country.or.Area)
# getting geo-location
mat = match(UN_birth$Country.or.Area, countriesLow$NAME)
UN_birth$lat = countriesLow$LAT[mat]
UN_birth$lon = countriesLow$LON[mat]
rm(mat)
# sorting countries by latitude
countries.levels = unique(UN_birth$Country.or.Area)
m = match(countries.levels,countriesLow$NAME )
lat = countriesLow$LAT[m]
o = order(lat, decreasing = TRUE)
countries.levels = countries.levels[o]
UN_birth$Country.or.Area = factor(UN_birth$Country.or.Area, levels = countries.levels)
# Data source
UN_birth$source = "UN"
save(UN_birth, file = paste0(IO$out_Rdata,"UN_birth_data.Rdata"))
rm(countries.levels, m, lat, o)The birth data were downloaded on Feb 21, 2018 from [http://svs.aids.gov.br/dantps/centrais-de-conteudos/infograficos/natalidade/]
BR_birth = read_csv(file = str_c(IO$r_Data,"Brazil_birth_data/","monthly_births_Brazil_detrended_2000_2017_wavelet_gamm_timing.csv"))## Parsed with column specification:
## cols(
## year = col_double(),
## location = col_character(),
## month = col_character(),
## month.number = col_double(),
## births = col_double(),
## time = col_double(),
## location.type = col_character(),
## moving.avg = col_double(),
## percent.dev.from.mean = col_double(),
## significant.seasonality = col_logical(),
## seasonal.power = col_double(),
## gamm.r2 = col_double(),
## gamm.prediction = col_double(),
## seasonal.strength = col_character(),
## peak.month = col_double()
## )
BR_birth$country = "Brazil"
BR_birth$date = str_c(BR_birth$year,"-",BR_birth$month.number,"-01") %>% as.Date()
BR_birth$reliability = "Final figure"
BR_birth$source = "Micaela" # XXX To be changed to the official source
save(BR_birth, file = paste0(IO$out_Rdata,"BR_birth_data.Rdata"))brazil.dict.list = list("North" = c("Acre", "Amapá", "Amazonas", "Pará", "Rondônia", "Roraima", "Tocantins"),
"Northeast" = c("Alagoas", "Bahia", "Ceará", "Maranhão", "Paraíba", "Pernambuco", "Piauí", "Rio Grande do Norte", "Sergipe"),
"Central-West" = c("Goiás", "Mato Grosso", "Mato Grosso do Sul", "Distrito Federal" ),
"Southeast" = c("Espírito Santo", "Minas Gerais", "Rio de Janeiro", "São Paulo"),
"South" = c("Paraná", "Rio Grande do Sul", "Santa Catarina"))
brazil.dict = data.frame(region = rep(names(brazil.dict.list), lengths(brazil.dict.list)), states = unlist(brazil.dict.list))
UN_birth = UN_birth %>% dplyr::mutate(
reliability = Reliability,
country = Country.or.Area,
area = "Whole country",
births = Value,
births_moving_avg = NA,
births_percent_dev_from_mean = NA
)
BR_birth = BR_birth %>% dplyr::mutate(
area = brazil.dict$region[match(BR_birth$location, brazil.dict$states)],
state = location,
births_moving_avg = moving.avg,
births_percent_dev_from_mean = percent.dev.from.mean
)
BR_birth_by_region = BR_birth %>% ddply(.,
.(country, area, date, source, reliability),
summarize,
births = sum(births),
births_moving_avg = NA,
births_percent_dev_from_mean = NA)
colnames = c("country","area","date","source","reliability","births","births_moving_avg","births_percent_dev_from_mean")
birth = rbind(
UN_birth %>% dplyr::select(colnames),
BR_birth_by_region %>% dplyr::select(colnames)
)
save(birth, file = paste0(IO$out_Rdata,"birth_data.Rdata"))User table:
users = read_feather(path = paste0(IO$input_clue,"users.feather"))
head(users)## # A tibble: 6 x 7
## user_id birth_year_bin country_area height_bin weight_bin
## <chr> <chr> <chr> <chr> <chr>
## 1 000320… 1970-1974 United Stat… 160-164 >=80
## 2 bb78e5… 1975-1979 France 160-164 50-54
## 3 9c1ae6… 1970-1974 France 165-169 70-74
## 4 4ac787… 1970-1974 United Stat… 170-174 >=80
## 5 1c9dfb… 1975-1979 Brazil - Ce… <NA> <NA>
## 6 33ae5a… 1970-1974 United King… 165-169 55-59
## # … with 2 more variables: latest_birth_control <chr>, csv_file <chr>
dim(users)## [1] 535154 7
cycles_00 = read_feather(path = paste0(IO$input_clue,"cycles/cycles0000.feather"))
head(cycles_00)## # A tibble: 6 x 11
## user_id cycle_nb cycle_start cycle_end cycle_length period_start
## <chr> <int> <date> <date> <int> <date>
## 1 155248… 7 2018-03-04 2018-03-28 25 2018-03-04
## 2 b89b6d… 2 2018-12-24 2019-01-16 24 2018-12-24
## 3 558762… 1 2018-02-10 2018-03-12 31 2018-02-10
## 4 c2c8fd… 7 2018-05-30 2018-07-04 36 2018-05-30
## 5 bfec0b… 23 2018-11-12 2018-12-14 33 2018-11-12
## 6 aece4d… 1 2018-04-04 2018-04-29 26 2018-04-04
## # … with 5 more variables: period_end <date>, period_length <int>,
## # neg_preg_test <lgl>, pos_preg_test <lgl>, latest_preg_test <chr>
dim(cycles_00)## [1] 287684 11
tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)
tracking = read_feather(path = paste0(tracking_folder,files[1]))
head(tracking)## # A tibble: 6 x 5
## user_id date category type number
## <chr> <date> <chr> <chr> <chr>
## 1 8dab364e26c969e3f242d1c37158d4b9… 2018-09-11 pain tender_brea… <NA>
## 2 f7b870737c60dd97a6e4a5522c79500a… 2018-01-09 mental calm <NA>
## 3 eedb725061a1eaba164713e2815b35ac… 2017-05-06 pill_hbc late <NA>
## 4 f56145c29cb39fa00b727e627be976b0… 2018-08-30 sleep 6-9 <NA>
## 5 15c407da50db2c39ef01841474798079… 2018-12-06 pill_hbc taken <NA>
## 6 d2277465eeab05b1c2d7982c097c13c9… 2017-11-29 sleep 3-6 <NA>
unique(tracking$category)## [1] "pain" "mental" "pill_hbc" "sleep"
## [5] "period" "ANY" "sex" "emotion"
## [9] "poop" "energy" "social" "ailment"
## [13] "digestion" "exercise" "fluid" "weight"
## [17] "medication" "iud" "test" "injection_hbc"
## [21] "bbt" "patch_hbc"
Workflow:
Split Country and Area
Compute estimated mean BMI
Re-assemble the tracking tables into batches, making sure a single user has all its tracking in the same batch
Add the users features to the tracking table
Label user’s time-series
Birth control (as declared by users)
Birth control (re-assignment based on users’ declaration and on their tracking history - see Breast Tenderness paper method)
User’s tracking activity (convolution by X days from “ANY” tracking to reflect that the user was using the app)
countries_areas = users$country_area %>% str_split_fixed(., " - ", n = 2)
users$country = countries_areas[,1]
users$area = countries_areas[,2]
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_countries.feather"))users$height_bin = factor(users$height_bin, levels = dict$height$bin)
users$weight_bin = factor(users$weight_bin, levels = dict$weight$bin)
heigh_mean = dict$height$mean[match(users$height_bin,dict$height$bin)]
weight_mean = dict$weight$mean[match(users$weight_bin,dict$weight$bin)]
users$est_mean_bmi = weight_mean/((heigh_mean/100)^2)
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_bmi.feather"))
ggplot(users, aes(x = est_mean_bmi, fill = country_area))+
geom_histogram(binwidth = 2)+
facet_grid(country_area ~ ., scale = "free_y")## Warning: Removed 153517 rows containing non-finite values (stat_bin).
max_batch_size = 5000
n_batch = max(par$min_n_batches, ceiling(nrow(users)/max_batch_size))
batch_size = ceiling(nrow(users)/n_batch)
users$batch = rep(1:n_batch, each = batch_size)[1:nrow(users)]
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_with_batches.feather"))tracking_folder = paste0(IO$input_clue,"tracking/")
files = list.files(tracking_folder)
tmp_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
if(!file.exists(tmp_folder)){
dir.create(tmp_folder,recursive = TRUE)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
users_original_file_ids = foreach(file = files, .combine = rbind, .packages = c("feather","stringr")) %dopar%
{
tracking = read_feather(path = paste0(tracking_folder, file))
dim(tracking)
tracking$batch = users$batch[match(tracking$user_id, users$user_id)]
if(nrow(tracking)>0){
for(b in unique(tracking$batch)){
tracking_batch = tracking[which(tracking$batch == b),]
new_file_name = gsub(".feather",paste0("_batch_",b,".feather"),file)
write_feather(tracking_batch, path = paste0(tmp_folder, new_file_name ))
}
users_original_file_ids = data.frame(user_id = unique(tracking$user_id), original_file_id = str_extract(file,"\\d{4}"))
}else{ users_original_file_ids = data.frame(user_id = character(), original_file_id = character())}
return(users_original_file_ids)
}
toc()
stopImplicitCluster()
users_o_f = aggregate(original_file_id ~ user_id ,users_original_file_ids, function(x) paste0(sort(x), collapse = ","))
colnames(users_o_f) = c("user_id","original_tracking_files")
write_feather(users_o_f, path = paste0(IO$tmp_clue,"original_tracking_files_per_users.feather"))
}## 453.054 sec elapsed
input_folder = paste0(IO$tmp_clue,"tracking_split_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
if(file.exists(output_folder)){unlink(output_folder, recursive = TRUE)} ;dir.create(output_folder,recursive = TRUE)
tmp_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
if(!dir.exists(tmp_folder)){
dir.create(tmp_folder,recursive = TRUE)
batches = unique(users$batch)
ok = foreach(b = batches) %do% {
cat("\t",b,"||\t")
all_files = list.files(input_folder)
files = all_files[grep(paste0("_batch_",b,"\\."),all_files)]
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tracking = foreach(file = files, .combine = rbind, .packages = "feather") %dopar%{tracking = read_feather(path = paste0(input_folder, file));return(tracking)}
stopImplicitCluster()
cols_to_add = c("birth_year_bin","country_area","height_bin","weight_bin", "est_mean_bmi")
m = match(tracking$user_id, users$user_id)
for(col_to_add in cols_to_add){
eval(parse(text = paste0("tracking$",col_to_add," = users$",col_to_add,"[m]")))
}
o = order(tracking$user_id, tracking$date, tracking$category, tracking$type, tracking$number)
tracking = tracking[o,]
new_file_name = paste0("tracking_",b,".feather")
write_feather(tracking, path = paste0(output_folder, new_file_name ))
ok = file.copy(from = paste0(output_folder, new_file_name ), to = paste0(tmp_folder, new_file_name), overwrite = TRUE)
}
}## 1 || 2 || 3 || 4 || 5 || 6 || 7 || 8 || 9 || 10 || 11 || 12 || 13 || 14 || 15 || 16 || 17 || 18 || 19 || 20 || 21 || 22 || 23 || 24 || 25 || 26 || 27 || 28 || 29 || 30 || 31 || 32 || 33 || 34 || 35 || 36 || 37 || 38 || 39 || 40 || 41 || 42 || 43 || 44 || 45 || 46 || 47 || 48 || 49 || 50 || 51 || 52 || 53 || 54 || 55 || 56 || 57 || 58 || 59 || 60 || 61 || 62 || 63 || 64 || 65 || 66 || 67 || 68 || 69 || 70 || 71 || 72 || 73 || 74 || 75 || 76 || 77 || 78 || 79 || 80 || 81 || 82 || 83 || 84 || 85 || 86 || 87 || 88 || 89 || 90 || 91 || 92 || 93 || 94 || 95 || 96 || 97 || 98 || 99 || 100 || 101 || 102 || 103 || 104 || 105 || 106 || 107 || 108 ||
if(!file.exists(paste0(IO$tmp_clue, "users_agg.feather"))){
tracking_folder = paste0(IO$output_clue,"tracking/")
files = list.files(tracking_folder)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
users_agg = foreach(file = files, .combine = rbind, .packages = c("feather","stringr", "plyr")) %dopar%
{
tracking = read_feather(path = paste0(tracking_folder, file))
users_agg = ddply(tracking,
.(user_id),
summarize,
first_obs = min(date),
last_obs = max(date),
n_obs_day = length(unique(date)),
n_obs = sum(category != "ANY"),
n_sex = sum(category == "sex"),
n_mucus = sum(category == "fluid"),
n_temp = sum(category == "bbt")
)
return(users_agg)
}
toc()
stopImplicitCluster()
write_feather(users_agg, path = paste0(IO$tmp_clue, "users_agg.feather"))
}## 2450.849 sec elapsed
users_agg = read_feather(path = paste0(IO$tmp_clue, "users_agg.feather"))
cols_to_add = setdiff(colnames(users_agg),"user_id")
m = match(users$user_id, users_agg$user_id)
for(col_to_add in cols_to_add){eval(parse(text = paste0("users$",col_to_add," = users_agg$",col_to_add,"[m]")))}
users$n_days = as.numeric(users$last_obs - users$first_obs)
write_feather(users, path = paste0(IO$output_clue, "users.feather"))
ok = file.copy(from = paste0(IO$output_clue, "users.feather"), to = paste0(IO$tmp_clue, "users_augmented.feather"))As declared by the users:
BC = read_feather(path = paste0(IO$input_clue,"birth_control.feather"))
o = order(BC$user_id, BC$date)
BC = BC[o,]
BC$birth_control = replace_na(BC$birth_control, "undefined")
write_feather(BC, path = paste0(IO$output_clue, "birth_control.feather"))BC = read_feather(path = paste0(IO$output_clue,"birth_control.feather"))
input_folder = paste0(IO$tmp_clue,"tracking_in_batches/")
output_folder = paste0(IO$output_clue,"tracking/")
tmp_folder = paste0(IO$tmp_clue, "tracking_with_user_defined_birth_control/")
if(!dir.exists(tmp_folder)){
dir.create(tmp_folder)
files = list.files(input_folder)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
catch =
foreach(file = files,
.combine = rbind,
.packages = c("feather","stringr", "plyr","tidyverse")
) %dopar% {
tracking = read_feather(path = paste0(input_folder, file))
agg = aggregate(date ~ user_id, tracking, min)
min_date = agg$date[match(tracking$user_id, agg$user_id)]
tracking = tracking %>% mutate(rel_date = as.numeric(date - min_date + 1))
u_tracking = tracking %>%
select(user_id, date) %>% unique() %>%
mutate(birth_control = NA,pill_type = NA, pill_regiment = NA)
u_BC = BC %>% select(-csv_file) %>% filter(user_id %in% unique(tracking$user_id)) %>% unique()
BC_exp = rbind(u_tracking, u_BC) %>% arrange(., user_id, date)
# need to expand the birth_control variables to replace the NAs with the latest value in the file
BC_exp = BC_exp %>% group_by(user_id) %>% mutate(
birth_control = replace_NAs_with_latest_value(birth_control) %>% replace_na("undefined"),
pill_type = replace_NAs_with_latest_value(pill_type),
pill_regiment = replace_NAs_with_latest_value(pill_regiment)
)
tracking_key = str_c(tracking$user_id, "_",tracking$date)
BC_key = str_c(BC_exp$user_id,"_",BC_exp$date)
m = match(tracking_key,BC_key)
tracking$birth_control_ud = BC_exp$birth_control[m]
tracking$pill_type_ud = BC_exp$pill_type[m]
tracking$pill_regiment_ud = BC_exp$pill_regiment[m]
write_feather(tracking, path = paste0(output_folder,file))
file.copy(from = paste0(output_folder,file), to = paste0(tmp_folder,file), overwrite = TRUE)
}
toc()
stopImplicitCluster()
}## 10996.129 sec elapsed
active_tracking_filter = function(x, N = 28*1.5){
res = stats::filter(c(rep(0,N-1),x), filter = rep(1,N), sides = 1) %>% pmin(1)
res = na.omit(res) %>% as.vector()
return(res)
}input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)
output_folder = paste0(IO$tmp_clue, "active_tracking/")
if(!dir.exists(output_folder)){
dir.create(output_folder)
time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)
cl = makeCluster(par$n_cores)
registerDoParallel(cl)
tic()
ok =
foreach(file = files,
.packages = c("dplyr", "feather")) %dopar% {
tracking = read_feather(path = paste0(input_folder, file))
any_tracking = tracking %>% filter(. , category == "ANY") %>%
select(c(user_id, date, birth_control_ud))
any_tracking$tracking_days = 1
tmp = data.frame(
user_id = rep(unique(tracking$user_id), each = length(time_vec)),
date = rep(time_vec, lu(tracking$user_id)))
active_tracking = dplyr::full_join(tmp,any_tracking, by = c("user_id","date"))
active_tracking$tracking_days[is.na(active_tracking$tracking_days)] = 0
active_tracking$birth_control_ud = ave(active_tracking$birth_control_ud,
by = active_tracking$user_id,
FUN = replace_NAs_with_latest_value)
active_tracking = active_tracking %>% group_by(user_id) %>%
mutate(., tracking = active_tracking_filter(tracking_days))
active_tracking = active_tracking %>% select(user_id, date,birth_control_ud, tracking)
# compressed table
active_tracking = active_tracking %>% group_by(user_id, birth_control_ud) %>%
mutate(transition = diff(c(0, tracking)))
active_tracking = active_tracking %>% group_by(user_id) %>%
mutate(stretch_num = cumsum(transition == 1))
active_tracking = active_tracking %>%
group_by(user_id, birth_control_ud, stretch_num, tracking) %>%
mutate(stretch_length = sum(tracking))
compressed_tracking = active_tracking %>% filter(transition == 1) %>%
select(user_id, date,birth_control_ud, stretch_num, stretch_length) %>%
rename(start_date = date)
file_name = paste0("active_",file)
write_feather(compressed_tracking, path = paste0(output_folder,file_name))
}
toc()
stopImplicitCluster()
}## 28525.397 sec elapsed
users = read_feather(path = paste0(IO$output_clue, "users.feather"))time_vec = seq(min(users$first_obs), max(users$last_obs), by = 1)input_folder = paste0(IO$output_clue,"tracking/")
files = list.files(input_folder)
input_active_tracking = paste0(IO$tmp_clue,"active_tracking/")
indicators_folder = paste0(IO$output_clue, "pop_indicators/")
if(!dir.exists(indicators_folder)){dir.create(indicators_folder)}
if(!file.exists(paste0(indicators_folder, "sex_pop_indicators.feather"))){
tic()
tracking_pop_agg = foreach(file = files, .combine = rbind, .packages = c("dplyr","tidyverse")) %do% {
cat("\t",file,"\t||")
# tracking
tracking = read_feather(path = paste0(input_folder, file))
tracking$BC = dict$BC$type[match(tracking$birth_control_ud, dict$BC$birth_control)]
tracking = filter(tracking, BC %in% c("F","I"))
# variables aggregates
tracking_pop_agg = ddply(tracking,
.(date,country_area,BC),
summarise,
n_prot_sex = sum((category == "sex") & (type == "protected_sex"), na.rm = TRUE),
n_unprot_sex = sum((category == "sex") & (type == "unprotected_sex"), na.rm = TRUE),
n_wd_sex = sum((category == "sex") & (type == "withdrawal_sex"), na.rm = TRUE),
n_exercise = sum(category == "exercise", na.rm = TRUE),
n_long_sleep = sum((category == "sleep")&(type == ">9"), na.rm = TRUE),
n_bleeding = sum((category == "period")&(type != "spotting"), na.rm = TRUE),
n_medium_bleeding = sum((category == "period") & (type == "medium"), na.rm = TRUE),
n_breast_pain = sum((category == "pain") & (type == "tender_breasts"), na.rm = TRUE),
n_pill_taken = sum((category == "pill_hbc") & (type == "taken"), na.rm = TRUE)
)
tracking_pop_agg = tracking_pop_agg %>% mutate(n_sex = n_prot_sex + n_unprot_sex + n_wd_sex)
# active tracking
active_tracking_compressed = read_feather(path = paste0(input_active_tracking,"active_",file))
active_tracking = expand_compressed_tracking(active_tracking_compressed)
active_tracking$BC = dict$BC$type[match(active_tracking$birth_control_ud, dict$BC$birth_control)]
active_tracking = filter(active_tracking, BC %in% c("F","I"))
# total number of users
active_tracking$country_area = tracking$country_area[match(active_tracking$user_id, tracking$user_id)]
active_tracking_agg = ddply(active_tracking,
.(date,country_area,BC),
summarise,
n_users = sum(tracking, na.rm = TRUE)
)
tmp = dplyr::full_join(x = active_tracking_agg , y = tracking_pop_agg, by = c("date","country_area","BC")) %>%
arrange(country_area, BC, date) %>%
replace_na(list(n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0,
n_party = 0, n_bleeding = 0, n_medium_bleeding = 0, n_breast_pain = 0, n_pill_taken = 0))
return(tmp)
}
toc()
tic()
# sum the results from all files
tmp = tracking_pop_agg %>% group_by(date, country_area, BC) %>%
summarise_each(.,sum) %>% arrange(country_area, BC, date)
# expand to have one row per possible date
tmp2 = expand.grid(date = time_vec, country_area = unique(tmp$country_area), BC = unique(tmp$BC))
tmp3 = dplyr::full_join(tmp, tmp2, by = c("date","country_area","BC")) %>%
arrange(country_area, BC, date) %>%
replace_na(., list(n_users = 0,n_prot_sex = 0,n_unprot_sex = 0, n_wd_sex= 0, n_sex = 0,
n_party = 0, n_bleeding = 0, n_medium_bleeding = 0, n_breast_pain = 0, n_pill_taken = 0))
# final table
tracking_pop_agg = tmp3
# save the results
write_feather(tracking_pop_agg, path = paste0(indicators_folder, "sex_pop_indicators.feather"))
toc()
}tracking_pop_agg = read_feather(path = paste0(indicators_folder, "sex_pop_indicators.feather"))
ggplot(tracking_pop_agg, aes(x = date, y = n_users, col = BC))+
geom_line()+
ggtitle("Total # of users per country and birth control")+
facet_wrap(country_area ~ .)ggplot(tracking_pop_agg, aes(x = date, y = n_sex/n_users, col = BC))+
geom_line()+
ggtitle("Relative sex freq. per country and birth control")+
facet_wrap(country_area ~ .)## Warning: Removed 491 rows containing missing values (geom_path).
ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_sex/n_users, col = BC))+
geom_line()+
ggtitle("Relative sex freq. per country and birth control")+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_prot_sex/n_users, col = BC))+
geom_line()+
ggtitle("Relative PROTECTED sex freq. per country and birth control")+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_unprot_sex/n_users, col = BC))+
geom_line()+
ggtitle("Relative UNPROTECTED sex freq. per country and birth control")+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_exercise/n_users, col = BC))+
geom_line()+
ggtitle("Relative reported PHYSICAL EXERCISE freq. per country and birth control")+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_long_sleep/n_users, col = BC))+
geom_line()+
ggtitle("Relative reported SLEEP duration >9h freq. per country and birth control")+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_medium_bleeding/n_users, col = BC))+
geom_line()+
ggtitle("Relative reported MEDIUM BLEEDING freq. per country and birth control")+
facet_grid(country_area ~ BC)ggplot(tracking_pop_agg %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = n_breast_pain/n_users, col = BC))+
geom_line()+
ggtitle("Relative reported BREAST PAIN freq. per country and birth control")+
facet_grid(country_area ~ BC)While the dataset hold a large number of user for each country/area, it may still not be representative of the population. One way to check that is to see if we can capture country-specific holiday in the remainder of the sexual behavior.
To test that, we will consider all sexual intercourse (protected and unprotected) and discard birth-control differences.
indicators_folder = paste0(IO$output_clue, "pop_indicators/")
tracking_pop_agg = read_feather(path = paste0(indicators_folder, "sex_pop_indicators.feather"))tt = tracking_pop_agg %>%
dplyr::select(.,-BC) %>% group_by(date, country_area) %>% summarize_each(sum) %>% arrange(country_area, date) %>%
mutate(r_sex = n_sex/n_users,
r_exercise = n_exercise/n_users,
r_long_sleep = n_long_sleep/n_users,
r_medium_bleeding = n_medium_bleeding/n_users,
r_breast_pain = n_breast_pain/n_users
)
ggplot(tt, aes(x = date, y = r_sex, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y")## Warning: Removed 78 rows containing missing values (geom_path).
ggplot(tt %>% filter(date > as.Date("2016-06-30")), aes(x = date, y = r_sex, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y")ggplot(tt %>% filter(date > as.Date("2017-06-30")), aes(x = date, y = r_sex, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y")ggplot(tt, aes(x = date, y = r_exercise, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Physical exercise")## Warning: Removed 3227 rows containing missing values (geom_path).
ggplot(tt, aes(x = date, y = r_long_sleep, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Slept for more than 9h")## Warning: Removed 3227 rows containing missing values (geom_path).
ggplot(tt, aes(x = date, y = r_breast_pain, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Reported preast pain")## Warning: Removed 78 rows containing missing values (geom_path).
ggplot(tt, aes(x = date, y = r_medium_bleeding, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y") + ggtitle("Reported medium bleeding")## Warning: Removed 78 rows containing missing values (geom_path).
tt = tt %>% filter(date > as.Date("2016-06-30"))
ggplot(tt, aes(x = date, y = r_sex, col = country_area)) +
geom_line()+ facet_grid(country_area~., scale = "free_y")seasonal_decomposition_daily_signal = function(x){
x_o = x
j = max(0,which(is.na(x_o)))
if(j > (length(x_o) - 365)){ # if less than a year of data, we just don't do it
res = data.frame(x = NA, trend = NA, weekly_pattern = NA, seasonal = NA, remainder = NA)[rep(1,length(x_o)),]
return(res)
}
x = x_o[(j+1):length(x_o)]
res = data.frame(x = x)
# overall trend
if(length(x)>= 3*365){ # using stl
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
res$trend = x_stl$time.series[,2] %>% as.numeric()
tmp_rem = apply(x_stl$time.series[,c(1,3)],1,sum)
}else{ # using loess
t = 1:length(x)
res$trend = predict(loess(x ~ t, span = 1)) %>% as.numeric()
tmp_rem = x - res$trend
}
# weekly oscillations
x_ts = ts(tmp_rem, frequency = 7)
x_stl = stl(x_ts, s.window = "periodic")
res$weekly_pattern = x_stl$time.series[,1]%>% as.numeric()
# seasonal trend
if(length(x)>2*365){
x_ts = ts(apply(x_stl$time.series[,2:3],1,sum), frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
res$seasonal = x_stl$time.series[,1]%>% as.numeric()
}else{
res$seasonal = 0
}
# remainder
res$remainder = res$x - res$trend - res$weekly_pattern - res$seasonal
if(j>0){
res_NAs = NA*res[rep(1,j),]
res = rbind(res_NAs, res)
}
return(res)
}sdds = foreach(country = unique(tt$country_area),
# c("France","Germany","Brazil - Central-West","United States - California","United States - Northeast"),
# unique(tt$country_area),
.combine = rbind) %do%{ #
j = which(tt$country_area == country)
x = tt$r_sex[j]
res = seasonal_decomposition_daily_signal(x)
colnames(res) = paste0("r_sex_",colnames(res))
res$country_area = country
res$date = tt$date[j]
return(res)
}
tmp = dplyr::full_join(x = tt, y = sdds %>% dplyr::select(-r_sex_x), by = c("country_area","date"))
tt = tmp
tt$r_sex_country_typical = tt$r_sex - tt$r_sex_trend - tt$r_sex_weekly_pattern
countries_areas = tt$country_area %>% str_split_fixed(.," - ",n = 2)
tt$country = countries_areas[,1]
tt$area = countries_areas[,2]
write_feather(tt, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_overall_by_country.feather"))
tt = read_feather(path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_overall_by_country.feather"))selected_holidays = holidays %>% filter(country != "Switzerland", country != "Germany")
ggplot(tt, aes(x = date, y = r_sex_country_typical, col = area))+
geom_vline(data = holidays, aes(xintercept = date), linetype = 1, col = "gold")+
geom_line()+
ggtitle("Total # of sex; overal + weekly trends removed")+
scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
facet_grid(country ~ ., scale = "free")ggplot(tt %>% filter(date >= as.Date("2017-07-01"), country != "Switzerland"), aes(x = date,y = r_sex_country_typical, col = area))+ # country_area
geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str), hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
geom_line()+
ggtitle("Total # of sex; overal + weekly trends removed")+
facet_grid(country ~ ., scale = "free")+
scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
theme(legend.position = "bottom")## Warning: Removed 75 rows containing missing values (geom_text).
ggplot(tt %>% filter(date >= as.Date("2018-07-01"), country != "Switzerland"), aes(x = date, y = r_sex_weekly_pattern, col = area))+
geom_line()+
ggtitle("Weekly trend")+
scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
facet_grid(country ~ .)+ # , scale = "free" +
theme(legend.position = "bottom")tt$weekday = wday(tt$date, week_start = 1, label = TRUE)
weekly_pattern = unique(tt %>% dplyr::select(country_area, country, area, weekday, r_sex_weekly_pattern))
ggplot(weekly_pattern %>% filter(country != "Switzerland"), aes(x = weekday, y = r_sex_weekly_pattern, col = country))+
geom_line(aes(group = country))+
geom_point()+
ggtitle("Weekly pattern per country")+
facet_grid(. ~ country_area) + # , scale = "free" +
guides(col = "none")sddo = foreach(country = unique(tt$country_area),
.combine = rbind) %do%{ #
cat(country,"\n")
j = which(tt$country_area == country)
x = tt$r_exercise[j]
res = seasonal_decomposition_daily_signal(x)
colnames(res) = paste0("r_exercise","_",colnames(res))
all_feat_res = res
x = tt$r_long_sleep[j]
res = seasonal_decomposition_daily_signal(x)
colnames(res) = paste0("r_long_sleep","_",colnames(res))
all_feat_res = cbind(all_feat_res,res)
x = tt$r_medium_bleeding[j]
res = seasonal_decomposition_daily_signal(x)
colnames(res) = paste0("r_medium_bleeding","_",colnames(res))
all_feat_res = cbind(all_feat_res,res)
x = tt$r_breast_pain[j]
res = seasonal_decomposition_daily_signal(x)
colnames(res) = paste0("r_breast_pain","_",colnames(res))
all_feat_res = cbind(all_feat_res,res)
all_feat_res$country_area = country
all_feat_res$date = tt$date[j]
return(all_feat_res)
}## Brazil - Central-West
## Brazil - Northeast
## France
## United Kingdom
## United States - California
## United States - Northeast
tmp = dplyr::full_join(x = tt,
y = sddo %>% dplyr::select(-r_exercise_x, r_long_sleep_x, r_medium_bleeding_x, r_breast_pain_x),
by = c("country_area","date"))
ttt = tmp
ttt$r_exercise_country_typical = ttt$r_exercise - ttt$r_exercise_trend - ttt$r_exercise_weekly_pattern
ttt$r_long_sleep_country_typical = ttt$r_long_sleep - ttt$r_long_sleep_trend - ttt$r_long_sleep_weekly_pattern
ttt$r_medium_bleeding_country_typical = ttt$r_medium_bleeding - ttt$r_medium_bleeding_trend - ttt$r_medium_bleeding_weekly_pattern
ttt$r_breast_pain_country_typical = ttt$r_breast_pain - ttt$r_breast_pain_trend - ttt$r_breast_pain_weekly_pattern
countries_areas = ttt$country_area %>% str_split_fixed(.," - ",n = 2)
ttt$country = countries_areas[,1]
ttt$area = countries_areas[,2]
write_feather(ttt, path =paste0(IO$output_clue,"pop_indicators/relative_other_features_indicators_overall_by_country.feather"))
ttt = read_feather(path =paste0(IO$output_clue,"pop_indicators/relative_other_features_indicators_overall_by_country.feather"))selected_holidays = holidays %>% filter(country != "Switzerland", country != "Germany")
# ggplot(ttt, aes(x = date, y = r_exercise_country_typical, col = area))+
# geom_vline(data = holidays, aes(xintercept = date), linetype = 1, col = "gold")+
# geom_line()+
# ggtitle("Exercise; overal + weekly trends removed")+
# scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
# facet_grid(country ~ ., scale = "free")
ggplot(ttt %>% filter(date >= as.Date("2017-07-01")),
aes(x = date,y = r_exercise_country_typical, col = area))+ # country_area
geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str),
hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
geom_line()+
ggtitle("Exercise; overal + weekly trends removed")+
facet_grid(country ~ ., scale = "free")+
scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
theme(legend.position = "bottom")## Warning: Removed 75 rows containing missing values (geom_text).
## Warning: Removed 950 rows containing missing values (geom_path).
ggplot(ttt %>% filter(date >= as.Date("2017-07-01")),
aes(x = date,y = r_long_sleep_country_typical, col = area))+ # country_area
geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str),
hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
geom_line()+
ggtitle("Slept more than 9 hours; overal + weekly trends removed")+
facet_grid(country ~ ., scale = "free")+
scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
theme(legend.position = "bottom")## Warning: Removed 75 rows containing missing values (geom_text).
## Warning: Removed 950 rows containing missing values (geom_path).
ggplot(ttt %>% filter(date >= as.Date("2017-07-01")),
aes(x = date,y = r_medium_bleeding_country_typical, col = area))+ # country_area
geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str),
hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
geom_line()+
ggtitle("Medium Bleeding; overal + weekly trends removed")+
facet_grid(country ~ ., scale = "free")+
scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
theme(legend.position = "bottom")## Warning: Removed 75 rows containing missing values (geom_text).
ggplot(ttt %>% filter(date >= as.Date("2017-07-01")),
aes(x = date,y = r_breast_pain_country_typical, col = area))+ # country_area
geom_vline(data = selected_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
geom_text(data = selected_holidays, aes(x = date, y = Inf, label = date_str),
hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
geom_line()+
ggtitle("Breast Pain; overal + weekly trends removed")+
facet_grid(country ~ ., scale = "free")+
scale_color_manual(values = c(rep("gray40",3),"coral1","slategray1"))+
xlim(c(as.Date("2017-06-30"),as.Date("2019-07-01")))+
theme(legend.position = "bottom")## Warning: Removed 75 rows containing missing values (geom_text).
ttt$weekday = wday(ttt$date, week_start = 1, label = TRUE)
weekly_pattern_other_features = unique(ttt %>% dplyr::select(country_area, country, area, weekday,
r_exercise_weekly_pattern,
r_long_sleep_weekly_pattern,
r_medium_bleeding_weekly_pattern,
r_breast_pain_weekly_pattern))
weekly_pattern_other_features_long = tidyr::pivot_longer(weekly_pattern_other_features,
cols = c(r_exercise_weekly_pattern,
r_long_sleep_weekly_pattern,
r_medium_bleeding_weekly_pattern,
r_breast_pain_weekly_pattern),
names_to = "feature")
weekly_pattern_other_features_long$feature = weekly_pattern_other_features_long$feature %>%
str_replace(.,"r_","") %>%
str_replace(.,"_weekly_pattern","")
ggplot(weekly_pattern_other_features_long,
aes(x = weekday, y = value, col = country))+
geom_line(aes(group = country_area))+
geom_point()+
ggtitle("Weekly pattern per country")+
facet_grid(. ~ feature) # , scale = "free" +## Warning: Removed 84 rows containing missing values (geom_point).
#guides(col = "none")
weekly_pattern_sex_long = weekly_pattern %>% mutate(feature = "sex", value = r_sex_weekly_pattern) %>% dplyr::select(-r_sex_weekly_pattern)
weekly_pattern_all_features_long = rbind(weekly_pattern_other_features_long, weekly_pattern_sex_long)
ggplot(weekly_pattern_all_features_long,
aes(x = weekday, y = value, col = country))+
geom_line(aes(group = country_area))+
geom_point()+
ggtitle("Weekly pattern per country")+
facet_grid(. ~ feature) ## Warning: Removed 84 rows containing missing values (geom_point).
sb = tracking_pop_agg %>%
dplyr::filter(date >= as.Date("2017-07-01")) %>%
mutate(r_bleeding = n_bleeding/n_users,
cat = interaction(country_area,BC)) %>%
arrange(cat, date)
sdb = foreach(cat = unique(sb$cat),
.combine = rbind) %do%{ #
cat(cat %>% as.character(),"\n")
j = which(sb$cat == cat)
x = sb$r_bleeding[j]
res = seasonal_decomposition_daily_signal(x)
colnames(res) = paste0("r_bleeding","_",colnames(res))
res$cat = cat
res$date = sb$date[j]
return(res)
}## Brazil - Central-West.F
## Brazil - Northeast.F
## France.F
## United Kingdom.F
## United States - California.F
## United States - Northeast.F
## Brazil - Central-West.I
## Brazil - Northeast.I
## France.I
## United Kingdom.I
## United States - California.I
## United States - Northeast.I
tmp = dplyr::full_join(x = sb,
y = sdb %>% dplyr::select(-r_bleeding_x),
by = c("cat","date"))
sb = tmp; rm(tmp, sdb)ggplot(sb %>% dplyr::filter(date %in% seq(min(sb$date), by = 1, len = 21)),
aes(x = date, y = r_bleeding_weekly_pattern, col = BC))+
geom_line()+geom_point()+
ggtitle("Weekly pattern (from time-series decomposition)")+
scale_x_date(date_labels = "%a %b %d %y")+
facet_grid(country_area ~ .)ggplot(sb,
aes(x = date, y = r_bleeding_remainder, col = BC))+
geom_line()+
ggtitle("Detrended bleeding pattern (overal trend and weekly trend removed)")+
facet_grid(country_area ~ .)Workflow:
Construct a table that has a datapoint for each day; country; birth control category (pp)
Detrend the time-series per country and birth-control
Construct a table with the montly aggregates (pa)
all_BC = tracking_pop_agg %>% dplyr::select(-BC) %>% group_by(country_area, date) %>%
summarize_each(sum)
all_BC$BC = "all"
all_BC = all_BC %>% dplyr::select(colnames(tracking_pop_agg))
pp = rbind(tracking_pop_agg %>% as.data.frame(), all_BC %>% as.data.frame())
pp = pp %>% arrange(country_area, BC, date)
pp = pp %>% filter(pp$date >= as.Date("2016-07-01"))
ggplot(pp, aes(x = date, y = n_users, col = BC))+
geom_hline(yintercept = 1000)+
geom_line()+
ggtitle("total number of users (monthly)")+
facet_wrap(country_area~.)pp = pp %>% mutate(
r_sex = n_sex/n_users,
r_prot = n_prot_sex/n_users,
r_unprot = n_unprot_sex/n_users)pp = pp %>% filter(country_area != "Switzerland")
pp$cat = interaction(pp$country_area, pp$BC)Seasonal decomposition on the Brazilian (Central West area) time-serie.
Tests to identifying the overal trend.
xx = pp$r_sex[pp$cat == "Brazil - Central-West.all"]
x = xx
head(x)## [1] 0.10869565 0.02083333 0.12500000 0.08163265 0.08000000 0.10000000
length(x)## [1] 1095
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Overal trend on raw time-series")# x_ts = ts(x, frequency = 7)
# x_stl = stl(x_ts, s.window = "periodic")
# plot(x_stl, main = "Weekly trend on raw time-series")
#
# x_ts = ts(x, frequency = 365/2)
# x_stl = stl(x_ts, s.window = "periodic")
# plot(x_stl, main = "1/2 year trend on raw time-series")Seasonal decomposition on the Brazilian (Central West area) time-serie.
x = xx
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Overal trend on raw time-series")res = data.frame(x = x, trend = x_stl$time.series[,2] %>% as.numeric() )
x = res$x - res$trend
x_ts = ts(x, frequency = 7)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Weekly pattern on the detrended time-series")res$weekly = x_stl$time.series[,1] %>% as.numeric()
x = res$x - res$trend - res$weekly
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
plot(x_stl, main = "Seasonal pattern on the detrended (overal and weekly) time-series")Detrending time-series of all countries (except Switzerland)
detrend_daily_signal = function(x){
res = data.frame(x = x)
# overall trend
x_ts = ts(x, frequency = 365)
x_stl = stl(x_ts, s.window = "periodic")
res$trend = x_stl$time.series[,2] %>% as.numeric()
res$detrended = x - res$trend
#res$detrended_shifted = res$detrended + mean(res$trend)
return(res)
}
detrended = foreach(category = unique(pp$cat), .combine = rbind) %do%{
res = foreach(feature = c("r_sex","r_prot","r_unprot"), .combine = rbind) %do%{
x = pp %>% filter(cat == category) %>% dplyr::select(feature) %>% unlist() %>% as.numeric() %>% replace_na(0)
res = detrend_daily_signal(x)
res = res %>% mutate(feature = feature,
cat = category,
date = pp$date[which(pp$cat == category)])
}
return(res)
}
detrended_wide = detrended %>%
dplyr::select(date,cat,feature,trend,detrended) %>%
tidyr::pivot_wider(names_from = feature, values_from = c(trend, detrended))
tmp = dplyr::full_join(x = pp, y = detrended_wide, by = c("cat","date"))
pp = tmp
write_feather(pp, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators.feather"))
ggplot(pp, aes(x = date, y = n_sex, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")
ggplot(pp, aes(x = date, y = r_sex, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")
ggplot(pp, aes(x = date, y = detrended_r_sex, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")
ggplot(pp, aes(x = date, y = detrended_r_unprot, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")
ggplot(pp %>% filter(month>=2017.5), aes(x = date, y = detrended_r_sex, col = BC))+
geom_line()+
facet_grid(country_area ~ BC, scale = "free")
ggplot(pp %>% filter(month>=2017.5), aes(x = date, y = detrended_r_unprot, col = BC))+
geom_line()+
facet_grid(country_area ~ BC, scale = "free")detrend_daily_signal = function(x){
res = data.frame(x = x, i = 1:length(x))
ll = loess(x ~ i, res, span = 365/length(x)) # yearly span
res$trend = predict(ll)
res$detrended = x - res$trend
return(res)
}
pp_full = pp
pp = pp %>% filter(date >= as.Date("2017-07-01"))
detrended = foreach(category = unique(pp$cat), .combine = rbind) %do%{
res = foreach(feature = c("r_sex","r_prot","r_unprot"), .combine = rbind) %do%{
x = pp %>% filter(cat == category) %>% dplyr::select(feature) %>% unlist() %>% as.numeric() %>% replace_na(0)
res = detrend_daily_signal(x)
res = res %>% mutate(feature = feature,
cat = category,
date = pp$date[which(pp$cat == category)])
}
return(res)
}
detrended_wide = detrended %>%
dplyr::select(date,cat,feature,trend,detrended) %>%
tidyr::pivot_wider(names_from = feature, values_from = c(trend, detrended))
tmp = dplyr::full_join(x = pp, y = detrended_wide, by = c("cat","date"))
pp = tmp
write_feather(pp, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_detrended.feather"))
ggplot(pp, aes(x = date, y = n_sex, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")ggplot(pp, aes(x = date, y = r_sex, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")ggplot(pp, aes(x = date, y = detrended_r_sex, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")ggplot(pp, aes(x = date, y = detrended_r_unprot, col = BC))+
geom_line()+
facet_grid(country_area ~ ., scale = "free")pp$date_month = year(pp$date) + (month(pp$date)-1)/12
pp$month = month(pp$date)
pp$year = year(pp$date)
pa = pp %>%
dplyr::select(country_area, BC, date_month,year, month,detrended_r_sex, detrended_r_prot, detrended_r_unprot) %>%
group_by(country_area, BC, year, date_month,month) %>%
summarize_each(mean) %>%
mutate(year = floor(date_month),
date = str_c(year,"-",month,"-01") %>% as.Date(),
country = str_split_fixed(country_area, " - ",2)[,1],
area = str_split_fixed(country_area, " - ",2)[,2])
write_feather(pa, path =paste0(IO$output_clue,"pop_indicators/relative_sexual_indicators_detrended_monthly.feather"))
# pa = pa %>% mutate(
# r_sex = n_sex/n_users,
# r_prot = n_prot_sex/n_users,
# r_unprot = n_unprot_sex/n_users)
ggplot(pa, aes(x = date_month, y = detrended_r_sex, col = BC))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+
ggtitle("Relative sex (all) counts (monthly)")+
facet_wrap(country_area ~ .)ggplot(pa, aes(x = date_month, y = detrended_r_prot, col = BC))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+
ggtitle("Relative PROTECTED sex counts (monthly)")+
facet_wrap(country_area ~ .)ggplot(pa, aes(x = date_month, y = detrended_r_unprot, col = BC))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+
ggtitle("Relative UNPROTECTED sex counts (monthly)")+
facet_wrap(country_area ~ .)ggplot(pa, aes(x = month, y = detrended_r_sex, col = BC, group = interaction(BC, year)))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+ scale_x_continuous(breaks = 1:12)+
ggtitle("Relative sex (all) counts (by month)")+
facet_wrap(country_area ~ .)ggplot(pa, aes(x = month, y = detrended_r_prot, col = BC, group = interaction(BC, year)))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+scale_x_continuous(breaks = 1:12)+
ggtitle("Relative PROTECTED sex counts (by month)")+
facet_wrap(country_area ~ .)ggplot(pa, aes(x = month, y = detrended_r_unprot, col = BC, group = interaction(BC, year)))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+scale_x_continuous(breaks = 1:12)+
ggtitle("Relative UNPROTECTED sex counts (by month)")+
facet_wrap(country_area ~ .)ggplot(pa, aes(x = month, y = detrended_r_sex, col = BC, group = interaction(BC, year)))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+ scale_x_continuous(breaks = 1:12)+
ggtitle("Relative sex (all) counts (by month)")+
facet_grid(country_area ~ BC)ggplot(pa, aes(x = month, y = detrended_r_prot, col = BC, group = interaction(BC, year)))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+scale_x_continuous(breaks = 1:12)+
ggtitle("Relative PROTECTED sex counts (by month)")+
facet_grid(country_area ~ BC)ggplot(pa, aes(x = month, y = detrended_r_unprot, col = BC, group = interaction(BC, year)))+
geom_hline(yintercept = 0, col = "gray50")+
geom_line()+scale_x_continuous(breaks = 1:12)+
ggtitle("Relative UNPROTECTED sex counts (by month)")+
facet_grid(country_area ~ BC)users = read_feather(path = paste0(IO$output_clue, "users.feather"))users$country_area = factor(users$country_area, levels = names(sort(table(users$country_area))))
ggplot(users, aes(x = reorder(country_area,desc(country_area)), fill = country_area)) + coord_flip() + geom_bar() + guides(fill = FALSE) +
xlab("") + ylab("# of users") + ggtitle("# of users per country") ggplot(users, aes(x = birth_year_bin, fill = country_area)) + geom_bar() + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE) + xlab("") + ylab("# of users") +
ggtitle("Birth year (Age) distribution")ggplot(users, aes(x = height_bin, fill = country_area)) + geom_bar() + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE) + xlab("") + ylab("# of users") +
ggtitle("Users' height distribution")ggplot(users, aes(x = weight_bin, fill = country_area)) + geom_bar() + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE)+ xlab("") + ylab("# of users") +
ggtitle("Users' weight distribution")ggplot(users, aes(x = est_mean_bmi, fill = country_area)) + geom_histogram(binwidth = 2) + facet_grid(country_area ~ ., scale = "free_y") + guides(fill = FALSE)+ xlab("") + ylab("# of users") +
ggtitle("Users' approx. BMI distribution")## Warning: Removed 153517 rows containing non-finite values (stat_bin).
users$latest_birth_control = factor(users$latest_birth_control, levels = names(sort(table(users$latest_birth_control))))
ggplot(users, aes(x = latest_birth_control, fill = country_area)) + coord_flip() + geom_bar()+ guides(fill = FALSE) + xlab("") + ylab("# of users") + ggtitle("Distribution of Birth Control")tt = read_feather(str_c(IO$output_clue,"pop_indicators/","relative_sexual_indicators_overall_by_country.feather"))
selected_holidays = holidays %>% dplyr::filter(country %in% unique(tt$country))
load(str_c(IO$out_Rdata,"birth_data.Rdata"), verbose = TRUE)## Loading objects:
## birth
pa = read_feather(path = str_c(IO$output_clue,"pop_indicators/relative_sexual_indicators_detrended_monthly.feather"))for(Country_Area in unique(tt$country_area)){
Country = unique(tt$country[tt$country_area == Country_Area])
Area = unique(tt$area[tt$country_area == Country_Area])
# Birth data
this_birth = birth %>% dplyr::filter(country == Country, (area == Area) | (area == "Whole country"))
max_date = max(this_birth$date)
if(month(max_date)<6){max_year = year(max_date)-1}else{max_year = year(max_date)}
date_range = c(str_c(max_year - 2,"-07-01"),str_c(max_year,"-07-01")) %>% as.Date
this_birth_last_2_years = this_birth %>% dplyr::filter( date >= min(date_range), date < max(date_range))
g_birth = ggplot(this_birth_last_2_years, aes(x = date, y = births,
col = area,
group = interaction(source, area, reliability)))+
geom_line()+
scale_x_date(limits = date_range, expand = c(0,0))+
ggtitle(str_c("Births - ",Country_Area))+
facet_grid(area ~ ., scale = "free")
g_birth
# Shifted Birth data (= conceptions)
this_conception_last_2_years = this_birth %>% mutate(date = date - months(8)) %>%
dplyr::filter( date >= min(date_range), date < max(date_range))
g_conception = ggplot(this_conception_last_2_years, aes(x = date, y = births,
col = area,
group = interaction(source, area, reliability)))+
geom_line()+
facet_grid(area ~ ., scale = "free")+
ggtitle("Shifted births (conceptions)")+
scale_x_date(limits = date_range, expand = c(0,0))
g_conception
# Monthly aggregate sexual patterns
this_pa = pa %>% dplyr::filter(country_area == Country_Area, BC == "F")
g_sex_monthly = ggplot(this_pa, aes(x = date, y = detrended_r_unprot, col = area))+
geom_hline(yintercept = 0, col = "gray80")+
geom_line()+
scale_x_date(limits = c(as.Date("2017-06-30"),as.Date("2019-07-01")),expand = c(0,0))+
facet_grid(area ~ ., scale = "free")
g_sex_monthly
# Sexual patterns
this_tt = tt %>% dplyr::filter(country_area == Country_Area, date >= as.Date("2017-07-01"))
this_holidays = holidays %>% dplyr::filter(country == Country)
g_typical = ggplot(this_tt, aes(x = date, y = r_sex_country_typical, col = area))+
geom_vline(data = this_holidays, aes(xintercept = date), linetype = 1, col = "gold")+
geom_text(data = this_holidays, aes(x = date, y = Inf, label = date_str),
hjust = 0, nudge_x = 1, vjust = 1, nudge_y = 0, col = "gold3", size = 3, check_overlap = TRUE)+
geom_line()+
ggtitle("Total # of sex; overal + weekly trends removed")+
ylab("Detrended sex")+
facet_grid(area ~ ., scale = "free")+
scale_color_manual(values = c(rep("gray40",1),"coral1","slategray1"))+
scale_x_date(limits = c(as.Date("2017-06-30"),as.Date("2019-07-01")),expand = c(0,0))
g_typical
plotlist = list(g_birth, g_conception, g_sex_monthly, g_typical)
g = plot_grid(plotlist = plotlist, ncol=1, align="v")
print(g)
}## Warning: Removed 22 rows containing missing values (geom_text).
## Warning: Removed 22 rows containing missing values (geom_text).
## Warning: Removed 22 rows containing missing values (geom_text).
## Warning: Removed 17 rows containing missing values (geom_text).
## Warning: Removed 14 rows containing missing values (geom_text).
## Warning: Removed 14 rows containing missing values (geom_text).